library(DBI)
library(tidyverse)
library(lubridate)
Set up function to coerce numeric strings to hms class with as_hms
asTime <- function(dateString, units = NULL) {
strptime(dateString, "%H%M") %>%
round(units) %>%
format("%H:%M:%S") %>%
hms::as_hms()
}
Set up and open connection to the database
if (file.exists("flights.db"))
file.remove("flights.db")
## [1] TRUE
conn <- dbConnect(RSQLite::SQLite(), "flights.db")
Load and write data to database
carriers <- read.csv("carriers.csv", header = TRUE)
airports <- read.csv("airports.csv", header = TRUE)
plane_data <- read.csv("plane-data.csv", header = TRUE)
dbWriteTable(conn, "airports", airports)
dbWriteTable(conn, "carriers", carriers)
dbWriteTable(conn, "plane_data", plane_data)
for (i in c(1995:2000)) {
fd <- read.csv(paste0(i, ".csv"), header = TRUE)
if (i == 1995) {
dbWriteTable(conn, "flight_data", fd, overwrite = TRUE)
} else {
dbWriteTable(conn, "flight_data", fd, append = TRUE)
}
}
Create local reference to database
flight_data <- tbl(conn, "flight_data")
glimpse(flight_data)
## Rows: ??
## Columns: 29
## Database: sqlite 3.36.0 [C:\Users\Joseph\Joshua PFDS\r\coursework\data\flights.db]
## $ Year <int> 1995, 1995, 1995, 1995, 1995, 1995, 1995, 1995, 1995~
## $ Month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~
## $ DayofMonth <int> 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, ~
## $ DayOfWeek <int> 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 1~
## $ DepTime <int> 657, 648, 649, 645, 645, 646, NA, 644, 644, 643, 642~
## $ CRSDepTime <int> 645, 645, 645, 645, 645, 645, 645, 645, 645, 645, 64~
## $ ArrTime <int> 952, 938, 932, 928, 931, 929, NA, 953, 938, 940, 935~
## $ CRSArrTime <int> 937, 937, 937, 937, 937, 937, 937, 937, 937, 937, 93~
## $ UniqueCarrier <chr> "UA", "UA", "UA", "UA", "UA", "UA", "UA", "UA", "UA"~
## $ FlightNum <int> 482, 482, 482, 482, 482, 482, 482, 482, 482, 482, 48~
## $ TailNum <chr> "N7298U", "N7449U", "N7453U", "N7288U", "N7275U", "N~
## $ ActualElapsedTime <int> 115, 110, 103, 103, 106, 103, NA, 129, 114, 117, 113~
## $ CRSElapsedTime <int> 112, 112, 112, 112, 112, 112, 112, 112, 112, 112, 11~
## $ AirTime <int> 83, 88, 83, 84, 82, 85, 45, 110, 94, 99, 93, 90, 86,~
## $ ArrDelay <int> 15, 1, -5, -9, -6, -8, NA, 16, 1, 3, -2, 0, -1, 39, ~
## $ DepDelay <int> 12, 3, 4, 0, 0, 1, NA, -1, -1, -2, -3, 7, 0, 26, -1,~
## $ Origin <chr> "ORD", "ORD", "ORD", "ORD", "ORD", "ORD", "ORD", "OR~
## $ Dest <chr> "PHL", "PHL", "PHL", "PHL", "PHL", "PHL", "PHL", "PH~
## $ Distance <int> 678, 678, 678, 678, 678, 678, 678, 678, 678, 678, 67~
## $ TaxiIn <int> 7, 5, 3, 3, 6, 5, 6, 5, 5, 3, 7, 5, 7, 10, 6, 4, 4, ~
## $ TaxiOut <int> 25, 17, 17, 16, 18, 13, 10, 14, 15, 15, 13, 10, 18, ~
## $ Cancelled <int> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ CancellationCode <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
## $ Diverted <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ CarrierDelay <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
## $ WeatherDelay <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
## $ NASDelay <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
## $ SecurityDelay <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
## $ LateAircraftDelay <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
Before proceeding with the analysis:
Check for the number of missing values in each column
missing_values <- flight_data %>%
summarise(across(everything(), ~ sum(is.na(.)))) %>%
collect() %>%
t() %>%
as.data.frame() %>%
rename(count = 1) %>%
filter(count > 0) %>%
arrange(count)
print(missing_values)
## count
## Distance 5987
## CRSElapsedTime 24391
## DepTime 804514
## DepDelay 804514
## AirTime 830130
## ArrTime 882178
## ActualElapsedTime 882178
## ArrDelay 882178
## CancellationCode 32686913
## CarrierDelay 32686913
## WeatherDelay 32686913
## NASDelay 32686913
## SecurityDelay 32686913
## LateAircraftDelay 32686913
Visualize the missing values on a bar chart and remove any variables with, NA = total number of observations
total_obs <- flight_data %>%
count() %>%
collect()
missing_values %>%
rownames_to_column() %>%
filter(count < as.numeric(total_obs)) %>%
ggplot() +
geom_bar(aes(x=rowname, y=count), stat = 'identity') +
labs(x='Variable', y="Number of Missing Values",
title='Bar Chart of Missing Values') +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
There might be some relationship between the missing values across the variables. Intuitively, canceled or diverted flights might explain the missing values.
Check the number of canceled or diverted flights
flight_data %>%
count(Cancelled == 1)
flight_data %>%
count(Cancelled == 1 | Diverted == 1 )
There is a relationship between the number of canceled or diverted flights and the missing values. More specifically, canceled and diverted flight causes missing values in departure and arrival time.
Hence, remove the records with missing values
flight_data <- flight_data %>%
filter(Cancelled == 0, Diverted == 0)
Drop columns that provide little to no information for further analysis
fd_mod <- flight_data %>%
select(-c(Cancelled, Diverted, CancellationCode, CarrierDelay,
WeatherDelay, NASDelay, SecurityDelay, LateAircraftDelay)) %>%
collect()
# FlightNum, ActualElapsedTime, CRSElapsedTime, AirTime, TaxiIn, TaxiOut
Make modifications to data where necessary:
DepDel15: binary indicator* for flights with departure delay time >= 15 mins
ArrDel15: binary indicator* for flights with arrival delay time >= 15 mins
*(0 = not delayed, 1 = delayed)
fd_mod <- fd_mod %>%
mutate(flight_date = with(fd_mod, paste(Year, Month, DayofMonth, sep = "/")),
season = ifelse(Month %in% 3:5, "Spring",
ifelse(Month %in% 6:8, "Summer",
ifelse(Month %in% 9:11, "Autumn", "Winter"))) %>%
factor(levels = c("Winter", "Spring", "Summer", "Autumn")),
DepDel15 = ifelse(DepDelay >= 15, 1, 0),
ArrDel15 = ifelse(ArrDelay >= 15, 1, 0)) %>%
collect()
glimpse(fd_mod)
## Rows: 31,804,735
## Columns: 25
## $ Year <int> 1995, 1995, 1995, 1995, 1995, 1995, 1995, 1995, 1995~
## $ Month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~
## $ DayofMonth <int> 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, ~
## $ DayOfWeek <int> 5, 6, 7, 1, 2, 3, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2~
## $ DepTime <int> 657, 648, 649, 645, 645, 646, 644, 644, 643, 642, 65~
## $ CRSDepTime <int> 645, 645, 645, 645, 645, 645, 645, 645, 645, 645, 64~
## $ ArrTime <int> 952, 938, 932, 928, 931, 929, 953, 938, 940, 935, 93~
## $ CRSArrTime <int> 937, 937, 937, 937, 937, 937, 937, 937, 937, 937, 93~
## $ UniqueCarrier <chr> "UA", "UA", "UA", "UA", "UA", "UA", "UA", "UA", "UA"~
## $ FlightNum <int> 482, 482, 482, 482, 482, 482, 482, 482, 482, 482, 48~
## $ TailNum <chr> "N7298U", "N7449U", "N7453U", "N7288U", "N7275U", "N~
## $ ActualElapsedTime <int> 115, 110, 103, 103, 106, 103, 129, 114, 117, 113, 10~
## $ CRSElapsedTime <int> 112, 112, 112, 112, 112, 112, 112, 112, 112, 112, 11~
## $ AirTime <int> 83, 88, 83, 84, 82, 85, 110, 94, 99, 93, 90, 86, 93,~
## $ ArrDelay <int> 15, 1, -5, -9, -6, -8, 16, 1, 3, -2, 0, -1, 39, 20, ~
## $ DepDelay <int> 12, 3, 4, 0, 0, 1, -1, -1, -2, -3, 7, 0, 26, -1, 8, ~
## $ Origin <chr> "ORD", "ORD", "ORD", "ORD", "ORD", "ORD", "ORD", "OR~
## $ Dest <chr> "PHL", "PHL", "PHL", "PHL", "PHL", "PHL", "PHL", "PH~
## $ Distance <int> 678, 678, 678, 678, 678, 678, 678, 678, 678, 678, 67~
## $ TaxiIn <int> 7, 5, 3, 3, 6, 5, 5, 5, 3, 7, 5, 7, 10, 6, 4, 4, 3, ~
## $ TaxiOut <int> 25, 17, 17, 16, 18, 13, 14, 15, 15, 13, 10, 18, 22, ~
## $ flight_date <chr> "1995/1/6", "1995/1/7", "1995/1/8", "1995/1/9", "199~
## $ season <fct> Winter, Winter, Winter, Winter, Winter, Winter, Wint~
## $ DepDel15 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0~
## $ ArrDel15 <dbl> 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0~
Write changes to database
dbWriteTable(conn, "flight_data", fd_mod, overwrite = TRUE)
flight_data <- tbl(conn, "flight_data")
The columns DepDel15 and ArrDel15 are binary indicators for departure and arrival delays.
The term “delay” in this analysis is classified where time delayed exceeds a grace period of 15 minutes. Despite having late departures, flights may arrive on time. Hence ArrDel15 (arrival delay >= 15) will be used to analyze when is best to fly to minimize delays.
arrdel_htmap <- dbGetQuery(conn,
"SELECT Year, Month, DayOfWeek, flight_date, ArrDel15
FROM flight_data
WHERE ArrDel15 == 1")
arrdel_htmap <- arrdel_htmap %>%
mutate(Month = Month %>% factor(labels = month.abb),
DayOfWeek = DayOfWeek %>% factor(levels = rev(c(7, 1, 2, 3, 4, 5, 6)),
labels = rev(c("Sun", "Mon", "Tue", "Wed",
"Thu", "Fri", "Sat"))),
flight_date = flight_date %>% as.Date("%Y/%m/%d"),
WeekofMonth = stringi::stri_datetime_fields(flight_date)$WeekOfMonth) %>%
group_by(Year, Month, WeekofMonth, DayOfWeek) %>%
summarise(total_delay = n())
arrdel_htmap %>%
ggplot(aes(WeekofMonth, DayOfWeek, fill = total_delay)) +
geom_tile(colour = "black") +
labs(title = "Arrival Delays Overview",
x = "Month / Week of Month",
y = "Year / Day of Week") +
theme_grey() +
theme(plot.title = element_text(size = 15, hjust = 0.5)) +
scale_fill_gradient(low = "white", high = "red") +
scale_x_discrete(limits = factor(1:5)) +
facet_grid(Year ~ Month) +
guides(fill = guide_legend(title = "Arrival Delays"))
The color gradient denotes the number of arrival delays. By observation, September to November has the least number of arrival delays.
arrdel_dom <- dbGetQuery(conn,
"SELECT DayofMonth, season, AVG(ArrDel15) AS per_del
FROM flight_data
GROUP BY DayofMonth, season")
arrdel_dom %>%
ggplot(aes(DayofMonth, per_del, group = season)) +
geom_line(aes(color = season), size = 1) +
labs(title = "Percentage of Arrival Delay by Season",
x = "Day of Month",
y = "Percentage of Arrival Delay") +
theme_grey() +
theme(plot.title = element_text(size = 15, hjust = 0.5)) +
scale_x_discrete(limits = factor(1:31)) +
scale_y_continuous(labels = scales::percent)
Autumn (the most bottom line) has the lowest percentage of flights delayed on arrival and hence is the Best Season to fly.
arrdel_mth <- dbGetQuery(conn,
"SELECT Month, AVG(ArrDel15) AS per_del
FROM flight_data
GROUP BY Month")
arrdel_mth %>%
ggplot(aes(Month, per_del)) +
geom_bar(stat = "identity", aes(fill = per_del)) +
geom_text(aes(label = paste(round(per_del, 4)*100, "%")),
size = 3, vjust = -1) +
scale_fill_gradientn(colors = rev(colorspace::heat_hcl(3))) +
labs(title = "Percentage of Arrival Delay by Month",
x = "Month",
y = "Percentage of Arrival Delay") +
theme_grey() +
theme(plot.title = element_text(size = 15, hjust = 0.5)) +
scale_x_discrete(limits = c("Jan", "Feb", "Mar", "Apr", "May", "Jun",
"Jul", "Aug", "Sep", "Oct", "Nov", "Dec")) +
scale_y_continuous(labels = scales::percent) +
guides(fill = guide_legend(title = "Percentage"))
The best Month(s) to fly is in May (Spring), September, and October (Autumn), with the probability of flights delay on arrival at 19.77%, 16.54%, and 18.71%, respectively.
arrdel_wk <- dbGetQuery(conn,
"SELECT Month, flight_date, ArrDel15
FROM flight_data")
arrdel_wk %>%
mutate(flight_date = flight_date %>% as.Date("%Y/%m/%d"),
WeekofMonth = stringi::stri_datetime_fields(flight_date)$WeekOfMonth) %>%
group_by(Month, WeekofMonth) %>%
summarise(per_del = mean(ArrDel15)) %>%
ggplot(aes(WeekofMonth, per_del)) +
geom_bar(stat = "identity", aes(fill = per_del)) +
geom_text(aes(label = paste(round(per_del, 4)*100, "%")),
size = 2.8, hjust = 0.8, angle = 90) +
scale_fill_gradientn(colors = rev(colorspace::heat_hcl(2))) +
labs(title = "Percentage of Arrival Delay by Week by Month",
x = "Week of Month",
y = "Percentage of Arrival Delay") +
theme_grey() +
theme(plot.title = element_text(size = 15, hjust = 0.5)) +
scale_x_discrete(limits = c("w1", "w2", "w3", "w4", "w5", "w6")) +
scale_y_continuous(labels = scales::percent) +
guides(fill = guide_legend(title = "Percentage")) +
facet_wrap(~ Month, labeller = as_labeller(c(`1` = "Jan", `2` = "Feb", `3` = "Mar",
`4` = "Apr", `5` = "May", `6` = "Jun",
`7` = "Jul", `8` = "Aug", `9` = "Sep",
`10` = "Oct", `11` = "Nov", `12` = "Dec")))
During the Autumn season, the first week of the Month is when it is best to fly. Statistically, the average percentage of arrival delays in the first week of the Autumn Season is 17.27%.
arrdel_dy <- dbGetQuery(conn,
"SELECT DayOfWeek, season, AVG(ArrDel15) AS per_del
FROM flight_data
GROUP BY DayOfWeek, season")
arrdel_dy %>%
ggplot(aes(DayOfWeek, per_del)) +
geom_bar(stat = "identity", aes(fill = per_del)) +
geom_text(aes(label = paste(round(per_del, 4)*100, "%")),
vjust = -1, size = 3) +
scale_fill_gradientn(colors = rev(colorspace::heat_hcl(4))) +
labs(title = "Percentage of Arrival Delay by Day of Week",
x = "Day of Week",
y = "Percentage of Arrival Delay") +
theme_grey() +
theme(plot.title = element_text(size = 15, hjust = 0.5)) +
scale_x_discrete(limits = c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")) +
scale_y_continuous(labels = scales::percent) +
guides(fill = guide_legend(title = "Percentage")) +
facet_wrap(~ season)
Diving deeper into the best day of the week to fly, the Saturdays of the Autumn season has percentage flights delayed on average 14.10%. Generally speaking, Saturdays are the best day of the week to fly regardless of the season.
Summary:
Winter: Saturday - 21.79%
Spring: Saturday - 16.85%
Summer: Saturday - 20.69%
Autumn: Saturday - 14.10%
Taking the CRSDepTime (Scheduled Departure Time) and binned every hour as a group. (24 groups in total)
arrdel_tm <- dbGetQuery(conn,
"SELECT CRSDepTime, ArrDel15
FROM flight_data
WHERE ArrDel15 == 1")
arrdel_tm <- arrdel_tm %>%
mutate(CRSDepTime = sprintf("%04d", arrdel_tm$CRSDepTime) %>%
asTime(units = "hours")) %>%
group_by(CRSDepTime) %>%
summarise(delay_count = n())
# Lets create a visualization - Number of Arrival Delays versus CRSDepTime
arrdel_tm %>%
ggplot(aes(x = CRSDepTime, y = delay_count)) +
geom_line(color="#69b3a2", size = 1.5) +
labs(title = "Arrival Delays versus Scheduled Departure Time",
x = "Schedule Departure Time",
y = "Number of Arrival Delays") +
theme_grey() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
plot.title = element_text(size = 15, hjust = 0.5)) +
scale_x_time(labels = scales::label_time(format = "%H:%M"),
breaks = scales::date_breaks("1 hour"))
By observation, flight delays were low in the early morning and increased after 10:00 hours. Between 17:00 - 20:00 has the highest number of flights delayed, with the numbers declining towards midnight.
Reduce the odds of flight delays by flying in the morning and avoiding flights that depart during 16:00 - 19:00 hours.
There are multiple factors besides its chronological age to consider when evaluating the age of a plane. This analysis will take 11 years, the average age of a U.S. commercial aircraft as a guide for “old” planes.
neg_age: proxy column for filtering invalid plane ages
age: dummy variable for age of the planes (0 = old planes, 1 = new planes.)
plane_date <- dbGetQuery(conn,
"SELECT flight_date, issue_date, ArrDel15
FROM (plane_data LEFT JOIN flight_data
ON plane_data.tailnum = flight_data.TailNum)
WHERE ArrDel15 >= 0")
# Coerce dates to date class
plane_date <- plane_date %>%
mutate(flight_date = flight_date %>% as.Date("%Y/%m/%d"),
issue_date = issue_date %>% as.Date("%m/%d/%Y"))
plane_date <- plane_date %>%
mutate(neg_age = ifelse(time_length(difftime(plane_date$flight_date,
plane_date$issue_date), "years") < 0, NA, 1),
age = ifelse(time_length(difftime(plane_date$flight_date,
plane_date$issue_date), "years") <= 11, 1, 0)) %>% filter(!is.na(neg_age))
glimpse(plane_date)
## Rows: 7,337,291
## Columns: 5
## $ flight_date <date> 1995-01-02, 1995-01-03, 1995-01-07, 1995-01-08, 1995-01-0~
## $ issue_date <date> 1992-04-10, 1992-01-06, 1992-05-27, 1987-03-27, 1991-08-0~
## $ ArrDel15 <dbl> 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0~
## $ neg_age <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~
## $ age <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~
Create a contingency table
obsvtn <- plane_date %>%
select(ArrDel15 , age) %>%
mutate(age = factor(plane_date$age, c(0, 1), labels = c("old", "new")),
ArrDel15 = factor(plane_date$ArrDel15, c(0:1), labels = c("On-time", "Delayed")))
xtbl_count <- table(obsvtn$age, obsvtn$ArrDel15)
print(xtbl_count)
##
## On-time Delayed
## old 699614 198162
## new 5009140 1430375
# Row percentages
xtbl_per <- prop.table(xtbl_count, 1)
Visualize the information on a side-by-side bar chart
xtbl_per %>%
as.data.frame() %>%
ggplot(aes(x = Var2, y = Freq, fill = Var1)) +
geom_bar(stat = "identity", position = "dodge") +
geom_text(aes(label = scales::percent(Freq)),
vjust = -0.5, size = 4, position = position_dodge(width = 0.9)) +
labs(title = "Percentage of Planes by Flight Performance grouped by Aircraft Age",
x = "Flight Perfomance",
y = "Percentage of Planes") +
theme_grey() +
theme(plot.title = element_text(size = 15, hjust = 0.5)) +
guides(fill = guide_legend(title = "Plane")) +
scale_y_continuous(labels = scales::percent)
Based on the visualization, there appears to be no association between the plane age and flight performance. To further confirm this result, we check for any statistical significance between the plane age and flight performance using the Chi-square test for association.
H0: There is no association between plane age and flight performance
H1: There is such an association
chisq.test(xtbl_count)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: xtbl_count
## X-squared = 8.9246, df = 1, p-value = 0.002814
As the p-value = 0.28%, the null hypothesis is rejected at the 1% significance level. The results are highly significant and provide strong evidence for rejecting the null hypothesis to conclude an association between plane age and flight performance.
Although the plane age and flight performance are statistically significant, its graphical visualization for the association is similar for both the age group, with only a 0.14% difference.
In conclusion, the association between plane age and flight performance is small, consistent, and biologically insignificant. Hence, it is unlikely that older planes do suffer from more delays.
There is no information on the number of passengers abroad on each plane. Hence, the number of flights is used as a proxy for the indication of popularity.
Check the airport with the most outbound flights
fd_routes <- dbGetQuery(conn,
"SELECT Year, Month, season, Origin, Dest
FROM flight_data")
fd_routes %>%
group_by(Year, Origin) %>%
summarise(count = n()) %>%
arrange(desc(count))
ORD - Chicago O’Hare International airport has the highest number of outbound flights and hence is used as the sampling frame for this analysis.
fd_routes %>%
mutate(route = with(fd_routes, paste(Origin, Dest, sep = "-")),
date = with(fd_routes, paste(Year, Month, sep = "-")) %>% zoo::as.yearmon()) %>%
filter(Origin == "ORD") %>%
group_by(date, route) %>%
summarise(count = n()) %>%
ggplot(aes(x = date, y = count)) +
geom_line() +
labs(title = "Number of Outbound Flights from ORD airport by Destination",
x = "Date",
y = "Number of Flights") +
theme_grey() +
theme(plot.title = element_text(size = 15, hjust = 0.5),
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
facet_wrap(~route, ncol = 13)
The visualization illustrates the trend of outgoing flights from ORD (Chicago) over the years, from 1995 to 2000 (left to right). The majority of the routes showed consistent trends.
Flights from Chicago to Minneapolis (ORD-MSP) have been consistently decreasing over the years, and flights from Chicago to Philadelphia (ORD-PHL) have been increasing. Meanwhile, flights to Seattle (ORD-SEA) are low at the beginning and end of each year and highest mid-year.
This result is exclusively for flights outbound from ORD airport, which does not make a good representation of all the airports. However, the same analysis using different origin airports of interest can be re-performed to uncover the flight patterns for the airport.
Analyze for cascading delays between the top 10 busiest airports in the USA in terms of flight frequency.
Flight frequency for the top 10 busiest airports/ cities
bz_airports <- dbGetQuery(conn,
"SELECT Origin AS iata, city as airport_city, COUNT(*) AS count
FROM flight_data, airports
WHERE airports.iata = flight_data.Origin
GROUP BY iata, airport
ORDER BY count DESC
Limit 10")
print(bz_airports)
## iata airport_city count
## 1 ORD Chicago 1704089
## 2 ATL Atlanta 1472688
## 3 DFW Dallas-Fort Worth 1464427
## 4 LAX Los Angeles 1106365
## 5 STL St Louis 1023369
## 6 PHX Phoenix 1006169
## 7 DTW Detroit 882485
## 8 MSP Minneapolis 812323
## 9 DEN Denver 789971
## 10 SFO San Francisco 786089
lagged_del: lag 1-period departure delay time
lagged_del <- dbGetQuery(conn,
"SELECT Origin, Dest, CRSDepTime, DepTime, DepDelay
FROM flight_data
WHERE CRSDepTime > 1900
AND CRSDepTime <= 2200
ORDER BY Origin, DepTime")
lagged_del <- lagged_del %>%
group_by(Origin) %>%
mutate(depdelay_lag = lag(DepDelay)) %>%
filter(!is.na(depdelay_lag))
print(lagged_del)
## # A tibble: 4,157,497 x 6
## # Groups: Origin [169]
## Origin Dest CRSDepTime DepTime DepDelay depdelay_lag
## <chr> <chr> <int> <int> <int> <int>
## 1 ABE MDT 2030 39 249 244
## 2 ABE MDT 2030 47 257 249
## 3 ABE DTW 2005 1400 1075 257
## 4 ABE MDT 1915 1908 -7 1075
## 5 ABE MDT 1915 1909 -6 -7
## 6 ABE DTW 1915 1910 -5 -6
## 7 ABE DTW 1915 1910 -5 -5
## 8 ABE DTW 1915 1910 -5 -5
## 9 ABE DTW 1915 1910 -5 -5
## 10 ABE MDT 1915 1910 -5 -5
## # ... with 4,157,487 more rows
lagged_del %>%
filter(Origin == "ORD") %>%
group_by(depdelay_lag) %>%
summarise(depdelay_mean = mean(DepDelay)) %>%
ggplot(aes(y = depdelay_mean, x = depdelay_lag)) +
geom_point() +
labs(y = "Departure Delay (mins)", x = "Previous Departure Delay (mins)") +
scale_x_continuous(breaks = seq(0, 1500, by = 120))
The above scatter diagram illustrates the relationship (positive) between the previous delay and the subsequent flights’ departure delay time (avg.) for flights outbound from Chicago O’Hare International airport between 19:00 – 22:00 hours.
Indicated by the increase in variability, the strength of the relationship cools off around the 480 (mins) mark. Suggesting that flights’ ability to depart on time increases with the duration delayed for the previous flight since flights with longer delays can intersperse with flights leaving on time.
lagged_del %>%
filter(Origin %in% bz_airports$iata) %>%
group_by(Origin, depdelay_lag) %>%
summarise(depdelay_mean = mean(DepDelay)) %>%
ggplot(aes(y = depdelay_mean, x = depdelay_lag)) +
geom_point() +
labs(y = "Departure Delay (mins)", x = "Previous Departure Delay (mins)") +
scale_x_continuous(breaks = seq(-750, 1500, by = 120)) +
facet_wrap(~ Origin, ncol=2)
Since flight schedules are aligned between the origin and destination of a flight, the impact of cascading delays in one airport on another airport can be interpreted implicitly through the relationship between previous delays and the subsequent flights’ departure delay time.
dbDisconnect(conn)